##### Georgia WRU package ; Personal Identifying Info script; keep secret 
library(devtools)
library(foreign)
library(rgdal)
library(sp)
library(dplyr)
library(wru)
library(ggplot2)
library(gridExtra)
library(stringi)
library(stringr)
library(tidyverse)
library(ggpubr)###
library(data.table)
library(svMisc)
devtools::install_github("https://github.com/jcuriel-unc/zipWRUext",subdir="zipWRUext2")
library(zipWRUext2)
options(stringsAsFactors = FALSE)

###setting directory ; commented out, though use for Rstudio             
#ga_wd <- dirname(rstudioapi::getActiveDocumentContext()$path)
#setwd(ga_wd)

###read in the non-pii data 
ga_voterfile2 <- readRDS("ga_voterfile_geocoded_postpii.rds")



#####let's do zip code here 
ga_voterfile2 <- zip_wru(ga_voterfile2, state="GEORGIA", type1="acs", year1="2018", 
                        zip_col="residence_zipcode", surname_field = "surname")
#sum(is.na(ga_voterfile2$pred.whi)) # only 5760 missing with zip codes

###change name for zip preds
pred_names <- colnames(ga_voterfile2[,grep("pred.", colnames(ga_voterfile2))])
pred_names
for (i in 21:25) {
  colnames(ga_voterfile2)[colnames(ga_voterfile2)==pred_names[i]] <-
    paste0(pred_names[i],sep="_","zip2018")
}
###now let's do zip codes 2010 
ga_voterfile2 <- zip_wru(ga_voterfile2, state="GEORGIA", type1="census", year1="2010", 
                        zip_col="zcta5", surname_field = "surname")
sum(is.na(ga_voterfile2$pred.whi)) # only 5760 missing with zip codes
pred_names <- colnames(ga_voterfile2[,grep("pred.", colnames(ga_voterfile2))])
pred_names
for (i in 26:30) {
  colnames(ga_voterfile2)[colnames(ga_voterfile2)==pred_names[i]] <-
    paste0(pred_names[i],sep="_","zip2010")
}

###let's subset here the NA values 

ga_voterfile2 <- subset(ga_voterfile2, is.na(pred.whi_blocks)==F & is.na(pred.whi_zip2018)==F)

###save the data here 
saveRDS(ga_voterfile2, "ga_voterfile_geocoded_postpii_final1.rds")



####with this, move onto the resampling. Will need to look at the race codes 
table(ga_voterfile2$race)
table(ga_voterfile2$race_desc)
ga_voterfile2$race_desc <- str_to_upper(ga_voterfile2$race_desc)
seeds_vec <- readRDS("seeds_vec.rds")
###will now subset into unaffiliated and present samples 
ga_voterfile2a <- subset(ga_voterfile2, race_desc != "" & race_desc != "UNKNOWN")
ga_voterfile2a <- subset(ga_voterfile2a, is.na(pred.whi_blocks)==F & is.na(pred.whi_tract)==F &
                           is.na(pred.whi_surname)==F & is.na(pred.whi_zip2018)==F & 
                           is.na(pred.whi_zip2010)==F & is.na(pred.whi_county)==F)
###looking at unaffiliated 
ga_voterfile2unaf <- subset(ga_voterfile2, race_desc=="UNKNOWN")
###what is prediction for these? 
sum(ga_voterfile2unaf$pred.whi_zip2018,na.rm=T)/nrow(ga_voterfile2unaf)
sum(ga_voterfile2$pred.whi_zip2018,na.rm=T)/nrow(ga_voterfile2)

###check the mis geocoded addrs. 
###get unique id 
ga_voterfile2$goecode <- paste0(ga_voterfile2$county,sep="_",ga_voterfile2$tract,sep="_",ga_voterfile2$block)
length(unique(ga_voterfile2$goecode))
ga_voterfile2$goecodetract <- paste0(ga_voterfile2$county,sep="_",ga_voterfile2$tract)
length(unique(ga_voterfile2$goecodetract))

###let's save sample set 

saveRDS(ga_voterfile2a, "ga_voterfile_geocoded2sample_set.rds")
####Set up the matrices here
whi_results_matrix <- matrix(NA, ncol=7, nrow = 10000)
bla_results_matrix <- matrix(NA, ncol=7, nrow = 10000)
his_results_matrix <- matrix(NA, ncol=7, nrow = 10000)
asi_results_matrix <- matrix(NA, ncol=7, nrow = 10000)
oth_results_matrix <- matrix(NA, ncol=7, nrow = 10000)
####loop here 
for(i in 1:10000){
  svMisc::progress((i/10000)*100)
  set.seed(seeds_vec[i])
  temp_sample <- sample_n(ga_voterfile2a, 1000)
  whi_results_matrix[i,1] <- length(which(temp_sample$race_desc=="WHITE NOT OF HISPANIC ORIGIN"))
  bla_results_matrix[i,1] <- length(which(temp_sample$race_desc=="BLACK NOT OF HISPANIC ORIGIN"))
  his_results_matrix[i,1] <- length(which(temp_sample$race_desc=="HISPANIC"))
  asi_results_matrix[i,1] <- length(which(temp_sample$race_desc=="ASIAN OR PACIFIC ISLANDER"))
  oth_results_matrix[i,1] <- 1000 - whi_results_matrix[i,1] -  bla_results_matrix[i,1] - asi_results_matrix[i,1] -
    his_results_matrix[i,1]
  ####now let'sget the sums for the other races 
  whi_results_matrix[i,4] <- sum(temp_sample$pred.whi_zip2010)
  bla_results_matrix[i,4] <- sum(temp_sample$pred.bla_zip2010)
  his_results_matrix[i,4] <- sum(temp_sample$pred.his_zip2010)
  asi_results_matrix[i,4] <- sum(temp_sample$pred.asi_zip2010)
  oth_results_matrix[i,4] <- 1000 - whi_results_matrix[i,4] - bla_results_matrix[i,4] - asi_results_matrix[i,4]-
    his_results_matrix[i,4]
  ###let's do tract 2018 
  whi_results_matrix[i,5] <- sum(temp_sample$pred.whi_zip2018)
  bla_results_matrix[i,5] <- sum(temp_sample$pred.bla_zip2018)
  his_results_matrix[i,5] <- sum(temp_sample$pred.his_zip2018)
  asi_results_matrix[i,5] <- sum(temp_sample$pred.asi_zip2018)
  oth_results_matrix[i,5] <- 1000 - whi_results_matrix[i,5] - bla_results_matrix[i,5] - asi_results_matrix[i,5] -
    his_results_matrix[i,5]
  
  ##### now tract 
  whi_results_matrix[i,6] <- sum(temp_sample$pred.whi_tract)
  bla_results_matrix[i,6] <- sum(temp_sample$pred.bla_tract)
  his_results_matrix[i,6] <- sum(temp_sample$pred.his_tract)
  asi_results_matrix[i,6] <- sum(temp_sample$pred.asi_tract)
  oth_results_matrix[i,6] <- 1000 - whi_results_matrix[i,5] - bla_results_matrix[i,5] - asi_results_matrix[i,5] -
    his_results_matrix[i,6]
  ###
  whi_results_matrix[i,3] <- sum(temp_sample$pred.whi_county)
  bla_results_matrix[i,3] <- sum(temp_sample$pred.bla_county)
  his_results_matrix[i,3] <- sum(temp_sample$pred.his_county)
  asi_results_matrix[i,3] <- sum(temp_sample$pred.asi_county)
  oth_results_matrix[i,3] <- 1000 - whi_results_matrix[i,3] - bla_results_matrix[i,3] - asi_results_matrix[i,3] -
    his_results_matrix[i,3] 
  ####
  whi_results_matrix[i,2] <- sum(temp_sample$pred.whi_surname)
  bla_results_matrix[i,2] <- sum(temp_sample$pred.bla_surname)
  his_results_matrix[i,2] <- sum(temp_sample$pred.his_surname)
  asi_results_matrix[i,2] <- sum(temp_sample$pred.asi_surname)
  oth_results_matrix[i,2] <- 1000 - whi_results_matrix[i,2] - bla_results_matrix[i,2] - asi_results_matrix[i,2] -
    his_results_matrix[i,2]
  ###let's do block
  whi_results_matrix[i,7] <- sum(temp_sample$pred.whi_blocks)
  bla_results_matrix[i,7] <- sum(temp_sample$pred.bla_blocks)
  his_results_matrix[i,7] <- sum(temp_sample$pred.his_blocks)
  asi_results_matrix[i,7] <- sum(temp_sample$pred.asi_blocks)
  oth_results_matrix[i,7] <- 1000 - whi_results_matrix[i,7] - bla_results_matrix[i,7] - asi_results_matrix[i,7] -
    his_results_matrix[i,7]
}
####cleaning and combining data here 
whi_results_matrix <- as.data.frame(whi_results_matrix)
bla_results_matrix <- as.data.frame(bla_results_matrix)
his_results_matrix <- as.data.frame(his_results_matrix)
asi_results_matrix <- as.data.frame(asi_results_matrix)
oth_results_matrix <- as.data.frame(oth_results_matrix)
###setting column names 
colnames(whi_results_matrix) <- c("empirical","surname","county","zip2010","zip2018", "tract", "blocks")
colnames(bla_results_matrix) <- c("empirical","surname","county","zip2010","zip2018", "tract", "blocks")
colnames(his_results_matrix) <- c("empirical","surname","county","zip2010","zip2018", "tract", "blocks")
colnames(asi_results_matrix) <- c("empirical","surname","county","zip2010","zip2018", "tract", "blocks")
colnames(oth_results_matrix) <- c("empirical","surname","county","zip2010","zip2018", "tract", "blocks")
###let's assign race, then move onto save 
whi_results_matrix$race <- "white"
bla_results_matrix$race <- "black"
his_results_matrix$race <- "hispanic"
asi_results_matrix$race <- "asian"
oth_results_matrix$race <- "other"
##combine 
final_race_results <- rbind(whi_results_matrix,bla_results_matrix,his_results_matrix,
                            asi_results_matrix,oth_results_matrix)
saveRDS(final_race_results, "race_resampled_results_ga.rds")

###getting distributions of errors and such 
###let's make into a long data set , which wil be the differences 
final_race_results$surname_diff <- 
  (abs(final_race_results$empirical-final_race_results$surname)/final_race_results$empirical)*100
final_race_results$county_diff <- 
  (abs(final_race_results$empirical-final_race_results$county)/final_race_results$empirical)*100
final_race_results$zip2010_diff <- 
  (abs(final_race_results$empirical-final_race_results$zip2010)/final_race_results$empirical)*100
final_race_results$zip2018_diff <- 
  (abs(final_race_results$empirical-final_race_results$zip2018)/final_race_results$empirical)*100
final_race_results$tract_diff <- 
  (abs(final_race_results$empirical-final_race_results$tract)/final_race_results$empirical)*100
final_race_results$blocks_diff <- 
  (abs(final_race_results$empirical-final_race_results$blocks)/final_race_results$empirical)*100
####let's now make into long 
final_race_results_long <- subset(final_race_results, select=c(race,surname_diff,county_diff,
                                                               zip2010_diff,zip2018_diff,tract_diff,
                                                               blocks_diff))
final_race_results_long <- gather(final_race_results_long, key="method",value="abs_diff",surname_diff:blocks_diff)
saveRDS(final_race_results_long, "resampled_race_diffs_long.rds")
saveRDS(final_race_results, "resampled_race_diffs_wide.rds")
final_race_results_long <-readRDS("resampled_race_diffs_long.rds")
final_race_results <- readRDS("resampled_race_diffs_wide.rds")
####now move onto the plots 
###lets try density 
white_density <- ggplot(data=subset(final_race_results_long,race=="white" & 
                                      (method=="zip2018_diff" | method=="county_diff" | method=="blocks_diff" ) ),
                        aes(x=abs_diff,fill=method)) + theme_minimal() +
  geom_density(alpha=0.2) + xlim(0,150) +  ylim(0,.2) +
  theme(legend.text=element_text(size=8),legend.title = element_text(size=10))+
  labs(title="White",y="Density",x="% Difference from reported race", fill="Method") + 
  scale_fill_discrete(name="Method", labels=c("Blocks","County","ZIP codes, 2018 ACS")) 
white_density
#####now for black density
black_density <- ggplot(data=subset(final_race_results_long,race=="black" & 
                                      (method=="zip2018_diff" | method=="county_diff" | method=="blocks_diff" ) ),
                        aes(x=abs_diff,fill=method)) + theme_minimal() +
  theme(legend.text=element_text(size=8),legend.title = element_text(size=10))+
  geom_density(alpha=0.2) + xlim(0,150) + ylim(0,.2) +
  labs(title="Black",y="Density",x="% Difference from reported race", fill="Method") + 
  scale_fill_discrete(name="Method", labels=c("Blocks","County","ZIP codes, 2018 ACS")) 
black_density
#####now for asian ; seems like we can exclude; uninformative 
asian_density <- ggplot(data=subset(final_race_results_long,race=="asian" & 
                                      (method=="zip2018_diff" | method=="county_diff" | method=="blocks_diff" ) ),
                        aes(x=abs_diff,fill=method)) + theme_minimal() +
  theme(legend.text=element_text(size=8),legend.title = element_text(size=10))+
  geom_density(alpha=0.2)  + ylim(0,.2) + xlim(0,150) +
  labs(title="Asian",y="Density",x="% Difference from reported race", fill="Method") + 
  scale_fill_discrete(name="Method", labels=c("Blocks","County","ZIP codes, 2018 ACS")) 
asian_density

####Let's do hispanic next 
hispanic_density <- ggplot(data=subset(final_race_results_long,race=="hispanic" & 
                                         (method=="zip2018_diff" | method=="county_diff" | method=="blocks_diff" ) ),
                           aes(x=abs_diff,fill=method)) + theme_minimal() +
  theme(legend.text=element_text(size=8),legend.title = element_text(size=10))+
  geom_density(alpha=0.2)  + ylim(0,.2) + xlim(0,150) +
  labs(title="Hispanic",y="Density",x="% Difference from reported race", fill="Method") + 
  scale_fill_discrete(name="Method", labels=c("Blocks","County","ZIP codes, 2018 ACS")) 
hispanic_density
####other 
other_density <- ggplot(data=subset(final_race_results_long,race=="other" & 
                                      (method=="zip2018_diff" | method=="county_diff" | method=="blocks_diff" ) ),
                        aes(x=abs_diff,fill=method)) + theme_minimal() +
  theme(legend.text=element_text(size=8),legend.title = element_text(size=10))+
  geom_density(alpha=0.2)  + ylim(0,.2) +  xlim(0,150) + 
  labs(title="Other",y="Density",x="% Difference from reported race", fill="Method") + 
  scale_fill_discrete(name="Method", labels=c("Blocks","County","ZIP codes, 2018 ACS")) 
other_density

###let's see if we can do a common legend 

density_plots2 <- ggarrange(white_density,black_density,asian_density,hispanic_density,
                            other_density,common.legend=TRUE,ncol=2,nrow=3,
                            legend="bottom")
density_plots2

ggsave("races_density_plots.png" ,plot=density_plots2, scale=1,width=9,height=12,units = c("in"),dpi=600 )

####now onto the table, that we'll likely place in the appendix 
race_table <- final_race_results %>% 
  group_by(race) %>% 
  dplyr::summarise(surname95=quantile(surname_diff,probs=c(0.95)),
                   county95=quantile(county_diff,probs=c(0.95)),
                   zip2010_95=quantile(zip2010_diff, probs=c(0.95)), zip2018_95=quantile(zip2018_diff,probs=c(0.95)),
                   tract95=quantile(tract_diff,probs=c(0.95)),blocks95=quantile(blocks_diff,probs=c(0.95)) )
race_table <- as.data.frame(race_table)
race_table
race_tableT <- t(race_table)
race_tableT
race_tableT <- as.data.frame(race_tableT)
race_tableT <- race_tableT[-1,]
colnames(race_tableT) <- c("Asian","Black","Hispanic","Other","White")
###col of names 
race_names <- as.data.frame(matrix(NA,nrow=6,ncol=1))
race_names[1,1] <- "Surname"
race_names[2,1] <- "County"
race_names[3,1] <- "ZCTA 2010"
race_names[4,1] <- "ZCTA 2018"
race_names[5,1] <- "Tract"
race_names[6,1] <- "Block"
###bind data 
race_tableT <- cbind(race_names,race_tableT)
names(race_tableT)
colnames(race_tableT)[1] <- "Level"
class(race_tableT)
race_tableT
for(i in 2:6){
  race_tableT[,i] <- round( as.numeric(race_tableT[,i]),2)
}

###now get the names fixed 
str(race_tableT)
race_tableT###Save data 

###transpose table save
saveRDS(race_tableT, "race_error_table95ci.rds")
write.csv(race_tableT, "race_error_table95ci.csv",row.names = F)

###getting the count diffs per 1000 

final_race_results$surname_diff_count <- 
  abs(final_race_results$empirical-final_race_results$surname)
final_race_results$county_diff_count <- 
  abs(final_race_results$empirical-final_race_results$county)
final_race_results$zip2010_diff_count <- 
  abs(final_race_results$empirical-final_race_results$zip2010)
final_race_results$zip2018_diff_count <- 
  abs(final_race_results$empirical-final_race_results$zip2018)
final_race_results$tract_diff_count <- 
  abs(final_race_results$empirical-final_race_results$tract)
final_race_results$blocks_diff_count <- 
  abs(final_race_results$empirical-final_race_results$blocks)

###let's make this into a long dataset, and then also produce the table like we did above. 
final_race_results_count_long <- subset(final_race_results, select=c(race,surname_diff_count,county_diff_count,
                                                                     zip2010_diff_count,zip2018_diff_count,
                                                                     tract_diff_count,
                                                                     blocks_diff_count))
final_race_results_count_long <- gather(final_race_results_count_long, key="method",value="abs_count_diff",
                                        surname_diff_count:blocks_diff_count)


####now move onto the plots 
###lets try density 
white_density_c <- ggplot(data=subset(final_race_results_count_long,race=="white" & 
                                        (method=="zip2018_diff_count" | method=="county_diff_count" | 
                                           method=="blocks_diff_count" ) ),
                          aes(x=abs_count_diff,fill=method)) + theme_minimal() +
  geom_density(alpha=0.2) + xlim(0,150) +  ylim(0,.125) +
  theme(legend.text=element_text(size=8),legend.title = element_text(size=10))+
  labs(title="White",y="Density",x="Abs. Diff. from reported race, per 1,000", fill="Method") + 
  scale_fill_discrete(name="Method", labels=c("Blocks","County","ZIP codes, 2018 ACS")) 
white_density_c
#####now for black density
black_density_c <- ggplot(data=subset(final_race_results_count_long,race=="black" & 
                                        (method=="zip2018_diff_count" | method=="county_diff_count" | 
                                           method=="blocks_diff_count" ) ),
                          aes(x=abs_count_diff,fill=method)) + theme_minimal() +
  theme(legend.text=element_text(size=8),legend.title = element_text(size=10))+
  geom_density(alpha=0.2) + xlim(0,150) + ylim(0,.125) +
  labs(title="Black",y="Density",x="Abs. Diff. from reported race, per 1,000", fill="Method") + 
  scale_fill_discrete(name="Method", labels=c("Blocks","County","ZIP codes, 2018 ACS")) 
black_density_c
#####now for asian ; seems like we can exclude; uninformative & 
asian_density_c <- ggplot(data=subset(final_race_results_count_long,race=="asian" & 
                                        (method=="zip2018_diff_count" | method=="county_diff_count" | 
                                           method=="blocks_diff_count" ) ),
                          aes(x=abs_count_diff,fill=method)) + theme_minimal() +
  theme(legend.text=element_text(size=8),legend.title = element_text(size=10))+
  geom_density(alpha=0.2) + xlim(0,150) + ylim(0,.125) +
  labs(title="Asian",y="Density",x="Abs. Diff. from reported race, per 1,000", fill="Method") + 
  scale_fill_discrete(name="Method", labels=c("Blocks","County","ZIP codes, 2018 ACS")) 
asian_density_c

####Let's do hispanic next 
hispanic_density_c <- ggplot(data=subset(final_race_results_count_long,race=="hispanic" & 
                                           (method=="zip2018_diff_count" | method=="county_diff_count" | 
                                              method=="blocks_diff_count" ) ),
                             aes(x=abs_count_diff,fill=method)) + theme_minimal() +
  theme(legend.text=element_text(size=8),legend.title = element_text(size=10))+
  geom_density(alpha=0.2)  + ylim(0,.125) + xlim(0,150) +
  labs(title="Hispanic",y="Density",x="Abs. Diff. from reported race, per 1,000", fill="Method") + 
  scale_fill_discrete(name="Method", labels=c("Blocks","County","ZIP codes, 2018 ACS")) 
hispanic_density_c
####other 
other_density_c <- ggplot(data=subset(final_race_results_count_long,race=="other" & 
                                        (method=="zip2018_diff_count" | method=="county_diff_count" | 
                                           method=="blocks_diff_count" ) ),
                          aes(x=abs_count_diff,fill=method)) + theme_minimal() +
  theme(legend.text=element_text(size=8),legend.title = element_text(size=10))+
  geom_density(alpha=0.2)  + ylim(0,.125) +  xlim(0,150) + 
  labs(title="Other",y="Density",x="Abs. Diff. from reported race, per 1,000", fill="Method") + 
  scale_fill_discrete(name="Method", labels=c("Blocks","County","ZIP codes, 2018 ACS")) 
other_density_c

###saving count density plots 
density_plots2count <- ggarrange(white_density_c,black_density_c,asian_density_c,hispanic_density_c,
                                 other_density_c,common.legend=TRUE,ncol=2,nrow=3,
                                 legend="bottom")
density_plots2count

density_plots3count <- ggarrange(white_density_c,black_density_c,asian_density_c,hispanic_density_c,
                                 common.legend=TRUE,ncol=2,nrow=2,
                                 legend="bottom")
density_plots3count

ggsave("races_density_plots_count.png" ,plot=density_plots2count, scale=1,width=9,height=12,units = c("in"),dpi=600 )
ggsave("races_density_plots_countA.png" ,plot=density_plots3count, scale=1,width=7,height=7,units = c("in"),dpi=600 )

###now let's create the table 

race_table_count <- final_race_results %>% 
  group_by(race) %>% 
  dplyr::summarise(surname95=quantile(surname_diff_count,probs=c(0.95)),
                   county95=quantile(county_diff_count,probs=c(0.95)),
                   zip2010_95=quantile(zip2010_diff_count, probs=c(0.95)), 
                   zip2018_95=quantile(zip2018_diff_count,probs=c(0.95)),
                   tract95=quantile(tract_diff_count,probs=c(0.95)),
                   blocks95=quantile(blocks_diff_count,probs=c(0.95)) )
race_table_count <- as.data.frame(race_table_count)
race_table_count
race_table_countT <- t(race_table_count)
race_table_countT
race_table_countT <- as.data.frame(race_table_countT)
race_table_countT <- race_table_countT[-1,]
colnames(race_table_countT) <- c("Asian","Black","Hispanic","Other","White")
###col of names 
race_names <- as.data.frame(matrix(NA,nrow=6,ncol=1))
race_names[1,1] <- "Surname"
race_names[2,1] <- "County"
race_names[3,1] <- "ZCTA 2010"
race_names[4,1] <- "ZCTA 2018"
race_names[5,1] <- "Tract"
race_names[6,1] <- "Block"
###bind data 
race_table_countT <- cbind(race_names,race_table_countT)
names(race_table_countT)
colnames(race_table_countT)[1] <- "Level"
class(race_table_countT)
race_table_countT
for(i in 2:6){
  race_table_countT[,i] <- round( as.numeric(race_table_countT[,i]),2)
}

###now get the names fixed 
str(race_table_countT)
race_table_countT###Save data 

###transpose table save
saveRDS(race_table_countT, "race_error_table_count95ci.rds")
write.csv(race_table_countT, "race_error_table95_countci.csv",row.names = F)

###Meta script; Tables for the appendix 

geocoded_output <- read.csv("sampled_df_geocoded01222021B.csv")
names(geocoded_output)
unique(geocoded_output$Loc_name)
(table(geocoded_output$Loc_name)/nrow(geocoded_output))*100

geocoded_output_col <- geocoded_output %>% group_by(Loc_name) %>% tally()
geocoded_output_col2 <- as.data.frame(geocoded_output_col) 
geocoded_output_col2$Percent <- round((geocoded_output_col2$n/nrow(geocoded_output))*100,2)
geocoded_output_col2 <- geocoded_output_col2[order(-geocoded_output_col2$Percent), ] 
colnames(geocoded_output_col2)[1:2] <- c("Locator","Number")
write.csv(geocoded_output_col2, "geocoded_output_col2.csv", row.names = FALSE)

head(geocoded_output_col2) #table 3 of the appendix 

### table 2 for the appendix 
ga_geocoded_anon <- read.csv("ga_geocoded_anon.csv")
(table(ga_geocoded_anon$Loc_name)/nrow(ga_geocoded_anon))*100

##now export the table 
geocoded_output_col_anon <- ga_geocoded_anon %>% group_by(Loc_name) %>% tally()
geocoded_output_col2_anon  <- as.data.frame(geocoded_output_col_anon) 
geocoded_output_col2_anon$Percent <- round((geocoded_output_col2_anon$n/nrow(ga_geocoded_anon))*100,2)
geocoded_output_col2_anon  <- geocoded_output_col2_anon[order(-geocoded_output_col2_anon$Percent), ] 
colnames(geocoded_output_col2_anon )[1:2] <- c("Locator","Number")
###replace if missing 
geocoded_output_col2_anon$Locator[geocoded_output_col2_anon$Locator==""] <- "N/A"
write.csv(geocoded_output_col2_anon , "appendix_table2.csv", row.names = FALSE)
